In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()
In [2]:
import nltk
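The stopword list used later ships separately from nltk itself; if it isn't already installed, it can be fetched once with nltk.download (a one-time setup step, not shown in the original run):
In [ ]:
# One-time download of the English stopword corpus used below
nltk.download('stopwords')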
In [3]:
messages = pd.read_csv('SMS Spam Collection/SMSSpamCollection',sep='\t',names=['Label','Message'])
In [4]:
messages.head()
Out[4]:
In [5]:
messages['Length'] = messages['Message'].apply(len)
In [6]:
messages.head()
Out[6]:
In [50]:
messages.describe(include=['object'])
Out[50]:
In [51]:
messages.describe()
Out[51]:
In [8]:
messages.groupby('Label').describe()
Out[8]:
In [9]:
sns.histplot(messages['Length'], kde=True)
Out[9]:
In [10]:
histbylabel = sns.FacetGrid(messages, col='Label', margin_titles=True, height=4, sharex=False, sharey=False)
histbylabel.map(plt.hist, 'Length', edgecolor='black', lw=0.1, bins=50);
Note that the x and y axes for the two plots above are not the same.
Ham messages tend to be shorter than spam messages.
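A quick numeric check of that claim, using the Length column computed above (a minimal sketch, not part of the original run):
In [ ]:
# Average message length per class; spam should come out noticeably longer
messages.groupby('Label')['Length'].mean()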
In [11]:
import string
from nltk.corpus import stopwords
In [12]:
def strippunc(message):
    # Remove punctuation characters, then split the remainder on whitespace
    nopunc = ''.join(c for c in message if c not in string.punctuation)
    return nopunc.split()
In [13]:
def stripstopwords(message):
    # Drop common English stopwords, comparing case-insensitively
    nostopwords = [word for word in message.split() if word.lower() not in stopwords.words('english')]
    return nostopwords
In [14]:
def stripuseless(message):
    # Combine the two steps above: strip punctuation, then drop stopwords
    noextras = ''.join(character for character in message if character not in string.punctuation)
    return [word for word in noextras.split() if word.lower() not in stopwords.words('english')]
In [15]:
messages['Message'].head().apply(strippunc)
Out[15]:
In [16]:
messages['Message'].head().apply(stripstopwords)
Out[16]:
In [17]:
messages['Message'].head().apply(stripuseless)
Out[17]:
In [18]:
from sklearn.model_selection import train_test_split
In [19]:
training_messages, test_messages, train_labels, test_labels = train_test_split(messages['Message'], messages['Label'], test_size = 0.33, random_state=47)
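Since spam is a minority class, passing stratify would keep the ham/spam proportions identical in the two splits. A variant sketch (the strat_* names here are illustrative and not used below):
In [ ]:
# Variant split: stratify keeps the ham/spam ratio equal in train and test
strat_train_msgs, strat_test_msgs, strat_train_labels, strat_test_labels = train_test_split(
    messages['Message'], messages['Label'],
    test_size=0.33, random_state=47, stratify=messages['Label'])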
In [20]:
from sklearn.feature_extraction.text import CountVectorizer
In [21]:
bagofwords_transformer = CountVectorizer(analyzer=stripuseless)
bagofwords_transformer.fit(training_messages)
Out[21]:
In [22]:
print(len(bagofwords_transformer.vocabulary_))
In [28]:
training_termdocmatrix = bagofwords_transformer.transform(training_messages)
In [29]:
training_termdocmatrix.shape
Out[29]:
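The term-document matrix is returned as a SciPy sparse matrix, since almost every entry is zero. A quick sketch to see just how sparse it is (not part of the original run):
In [ ]:
# Fraction of nonzero entries in the sparse term-document matrix
nonzero = training_termdocmatrix.nnz
total = training_termdocmatrix.shape[0] * training_termdocmatrix.shape[1]
print('density: {:.4%}'.format(nonzero / total))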
In [25]:
from sklearn.feature_extraction.text import TfidfTransformer
In [30]:
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(training_termdocmatrix)
Out[30]:
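For reference, with its default settings (smooth_idf=True, norm='l2'), scikit-learn's TfidfTransformer computes, for a term t in a corpus of n documents,
idf(t) = ln((1 + n) / (1 + df(t))) + 1 and tf-idf(t, d) = tf(t, d) * idf(t),
where df(t) is the number of training documents containing t; each document row is then L2-normalized.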
In [31]:
training_tfidfmatrix = tfidf_transformer.transform(training_termdocmatrix)
In [32]:
test_termdocmatrix = bagofwords_transformer.transform(test_messages)
test_tfidfmatrix = tfidf_transformer.transform(test_termdocmatrix)
Note: the two steps above could also have been accomplished with scikit-learn's TfidfVectorizer class, which combines CountVectorizer and TfidfTransformer in a single object.
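For example, a single TfidfVectorizer with the same custom analyzer would produce the same matrices. A sketch, not run here (the _alt names are illustrative and not used elsewhere):
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One-step equivalent of CountVectorizer + TfidfTransformer
tfidf_vectorizer = TfidfVectorizer(analyzer=stripuseless)
training_tfidfmatrix_alt = tfidf_vectorizer.fit_transform(training_messages)
test_tfidfmatrix_alt = tfidf_vectorizer.transform(test_messages)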
In [33]:
from sklearn.naive_bayes import MultinomialNB
In [35]:
spam_detection_model = MultinomialNB().fit(training_tfidfmatrix, train_labels)
In [36]:
spam_detection_model.predict(training_tfidfmatrix)[3]
Out[36]:
In [37]:
train_labels[3]
Out[37]:
In [46]:
spam_detection_model.score(training_tfidfmatrix, train_labels)
Out[46]:
In [44]:
spam_detection_model.predict(test_tfidfmatrix)[3]
Out[44]:
In [43]:
test_labels.iloc[3]
Out[43]:
In [45]:
spam_detection_model.score(test_tfidfmatrix, test_labels)
Out[45]:
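The vectorize, tf-idf, and classify stages can also be wrapped in a scikit-learn Pipeline, which guarantees the training and test messages are always transformed consistently. A sketch of the same workflow (spam_pipeline is an illustrative name, not run in the original notebook):
In [ ]:
from sklearn.pipeline import Pipeline

# Bundle the three stages so fit/predict/score work directly on raw messages
spam_pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=stripuseless)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
])
spam_pipeline.fit(training_messages, train_labels)
spam_pipeline.score(test_messages, test_labels)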
In [52]:
from sklearn.metrics import classification_report, confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
In [48]:
print(classification_report(test_labels, spam_detection_model.predict(test_tfidfmatrix)))
For ham, precision is 0.95 (95% of messages predicted as ham were actually ham) and recall is 1.00 (100% of actual ham messages were predicted as ham).
For spam, precision is 1.00 (100% of messages predicted as spam were actually spam) but recall is only 0.68 (68% of actual spam messages were predicted as spam).
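Those same figures can be pulled out individually with precision_score and recall_score, treating 'spam' as the positive class (a sketch using the names from above; test_predictions is an illustrative name):
In [ ]:
from sklearn.metrics import precision_score, recall_score

test_predictions = spam_detection_model.predict(test_tfidfmatrix)
print(precision_score(test_labels, test_predictions, pos_label='spam'))
print(recall_score(test_labels, test_predictions, pos_label='spam'))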
In [64]:
with sns.axes_style({'axes.grid': False}):
    cm_fig, cm_ax = plot_confusion_matrix(confusion_matrix(test_labels, spam_detection_model.predict(test_tfidfmatrix)))
    cm_ax.set_xticklabels(['', 'ham', 'spam'])
    cm_ax.set_yticklabels(['', 'ham', 'spam'])
In [ ]: